This R Markdown document covers the exploratory data analysis (EDA) of the training and validation datasets of the PRD.
## Loading the needed libraries
library(kableExtra) # to build complex tables and adjust table styles
library(tidyverse) # for general data wrangling (includes readr and dplyr)
library(ggplot2) # to draw statistical plots
library(plotly) # to construct interactive 3D plots
library(DataExplorer) # for automated data exploration
library(corrplot) # to plot correlation matrices
library(caret) # includes several pre-processing functions
library(scales) # to determine breaks and labels for axes and legends
library(skimr)
library(funModeling)
library(Hmisc)
library(grid)
library(hrbrthemes)
library(tidyr)
library(viridis)
library(ggpubr)
library(ggthemes)
library(GGally)
library(nortest)
## Loading the training and validation datasets
# Restore the objects saved in the .RData file into the session. The chunks
# below read df_train_f and df_validate_f, which presumably both come from
# this file (nothing else in view creates them) -- TODO confirm.
# NOTE(review): the path points into a checkout under the user's home
# directory, so the document only knits where that layout exists.
load("~/GitHub/ff-beta-release-matching/poc/EDA/data_milestone2_df_train_validate_20191025.RData")
## View train dataframe
# Render the first rows of the training frame (head() default) as a styled
# HTML table wrapped in a full-width scroll box.
# full_width is spelled FALSE: `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
kable(head(df_train_f)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")

| client_id | num_active_days | content_crashes | active_hours | uri_count | session_length | search_count | num_bookmarks | num_pages | daily_unique_domains | daily_max_tabs | daily_tabs_opened | startup_ms | daily_num_sessions_started | active_hours_max | uri_count_max | session_length_max | search_count_max | num_pages_max | daily_unique_domains_max | daily_max_tabs_max | daily_tabs_opened_max | startup_ms_max | daily_num_sessions_started_max | label | install_year | profile_age | fxa_configured | sync_configured | is_default_browser | locale | normalized_channel | app_version | default_search_engine | country | timezone_offset | num_addons | cpu_cores | cpu_speed_mhz | cpu_l2_cache_kb | cpu_vendor | memory_mb | os_version | is_wow64 | FX_PAGE_LOAD_MS_2_PARENT | TIME_TO_DOM_COMPLETE_MS | TIME_TO_DOM_CONTENT_LOADED_END_MS | TIME_TO_LOAD_EVENT_END_MS | TIME_TO_DOM_INTERACTIVE_MS | TIME_TO_NON_BLANK_PAINT_MS | profile_age_cat | distro_id_norm | timezone_cat | memory_cat | cpu_speed_cat | cpu_cores_cat | is_release | cpu_l2_cache_kb_cat |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 001cf926-92e3-4587-887e-d3156ba24d82 | 8 | 0 | 1.4215278 | 76.1250 | 22.9337499 | 1.875 | 11.00 | 4014.5 | 6.062500 | 7.00000 | 15.00 | 54176.2500 | 0.6250000 | 3.3208333 | 139 | 37.007500 | 6 | 15464 | 17.000000 | 11 | 29 | 180095 | 2 | beta | 2016 | 1160 | False | False | True | en-US | beta | 67 | DuckDuckGo | US | -240 | 8 | 2 | 2527 | 256 | Intel | 4022 | 6.1 | False | 4223.089 | 5220.036 | 9079.136 | 5221.752 | 6157.840 | 5198.1300 | < 5 years | Mozilla | (-6,-4] | < 4GB | < 3GHz | 2 | FALSE | < 256 |
| 00210163-2123-427e-bb73-398bda9f9eba | 5 | 0 | 0.8305556 | 168.2000 | 2.4390556 | 0.800 | 248.75 | 20599.5 | 7.866667 | 3.40000 | 14.80 | 3164.0667 | 1.2000000 | 1.6708333 | 325 | 8.318333 | 3 | 20719 | 17.000000 | 4 | 33 | 4966 | 2 | beta | 2016 | 1079 | False | False | False | en-US | beta | 67 | DuckDuckGo | GB | 60 | 6 | 2 | 2394 | 256 | Intel | 3810 | 6.1 | False | 2148.350 | 2253.526 | 1159.979 | 2146.827 | 1155.050 | 1015.9784 | < 5 years | Mozilla | (0,2] | < 4GB | < 3GHz | 2 | FALSE | < 256 |
| 0024fd24-4ef5-4771-850a-9e3846597015 | 2 | 0 | 0.5111111 | 82.0000 | 0.8712505 | 2.500 | 9.00 | 87.0 | 2.166667 | 4.00000 | 8.00 | 23977.9444 | 5.0000000 | 0.8250000 | 145 | 1.464445 | 5 | 87 | 3.333333 | 6 | 15 | 31918 | 9 | beta | 2019 | 745 | False | False | True | en-US | beta | 67 | GB | 60 | 6 | 4 | 2394 | 256 | Intel | 8124 | 10.0 | False | 2699.834 | 2216.994 | 1832.642 | 2119.277 | 1819.612 | 1979.7910 | < 5 years | Mozilla | (0,2] | < 16GB | < 3GHz | < 4 | FALSE | < 256 | |
| 004f70f7-2576-4de5-94b4-5bf1acdca0a8 | 8 | 0 | 0.3946181 | 101.8750 | 4.6785415 | 7.750 | 87.00 | 8882.0 | 7.750000 | 9.25000 | 19.50 | 1703.3125 | 1.3750000 | 1.1625000 | 210 | 9.174722 | 10 | 9044 | 13.000000 | 12 | 37 | 3454 | 2 | beta | 2018 | 130 | True | True | True | en-US | beta | 67 | US | -240 | 10 | 4 | 3991 | 256 | Intel | 16235 | 10.0 | False | 2370.563 | 2368.195 | 1614.073 | 2356.307 | 1497.652 | 850.0551 | < 6 months | Mozilla | (-6,-4] | < 16GB | < 4GHz | < 4 | FALSE | < 256 | |
| 007c0c11-38e4-476b-a494-d732e15ac159 | 4 | 0 | 0.5930556 | 167.5000 | 5.6081245 | 0.250 | 13.00 | 3024.0 | 1.775000 | 8.75000 | 13.75 | 15285.5375 | 2.2500000 | 0.8791667 | 255 | 9.059722 | 1 | 3552 | 2.500000 | 27 | 36 | 24592 | 3 | beta | 2018 | 293 | False | False | False | en-US | beta | 67 | US | 360 | 7 | 2 | 2659 | 3072 | Intel | 3317 | 10.0 | False | 4050.417 | 5106.575 | 3630.348 | 5041.016 | 3618.008 | 2006.3376 | < 2 years | Mozilla | (4,6] | < 4GB | < 3GHz | 2 | FALSE | > 1024 | |
| 0294837f-c98f-44ab-8237-30d2eba6c55a | 6 | 0 | 1.6333333 | 165.8333 | 32.4127778 | 5.500 | 17.00 | 7740.7 | 8.583333 | 10.33333 | 20.00 | 975.2222 | 0.1666667 | 3.2319444 | 323 | 41.553611 | 12 | 7975 | 13.000000 | 18 | 35 | 1088 | 1 | beta | 2019 | 502 | False | False | True | en-US | beta | 67 | other (non-bundled) | GB | 60 | 7 | 4 | 1800 | 256 | Intel | 8026 | 10.0 | False | 2740.793 | 2170.362 | 1482.620 | 2033.104 | 1161.901 | 1034.0580 | < 2 years | Mozilla | (0,2] | < 16GB | < 2GHz | < 4 | FALSE | < 256 |
## View validation dataframe
# Render the first rows of the validation frame (head() default) as a styled
# HTML table wrapped in a full-width scroll box.
# full_width is spelled FALSE: `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
kable(head(df_validate_f)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")

| client_id | num_active_days | content_crashes | active_hours | uri_count | session_length | search_count | num_bookmarks | num_pages | daily_unique_domains | daily_max_tabs | daily_tabs_opened | startup_ms | daily_num_sessions_started | active_hours_max | uri_count_max | session_length_max | search_count_max | num_pages_max | daily_unique_domains_max | daily_max_tabs_max | daily_tabs_opened_max | startup_ms_max | daily_num_sessions_started_max | label | install_year | profile_age | fxa_configured | sync_configured | is_default_browser | locale | normalized_channel | app_version | default_search_engine | country | timezone_offset | num_addons | cpu_cores | cpu_speed_mhz | cpu_l2_cache_kb | cpu_vendor | memory_mb | os_version | is_wow64 | FX_PAGE_LOAD_MS_2_PARENT | TIME_TO_DOM_COMPLETE_MS | TIME_TO_DOM_CONTENT_LOADED_END_MS | TIME_TO_LOAD_EVENT_END_MS | TIME_TO_DOM_INTERACTIVE_MS | TIME_TO_NON_BLANK_PAINT_MS | profile_age_cat | distro_id_norm | timezone_cat | memory_cat | cpu_speed_cat | cpu_cores_cat | is_release | cpu_l2_cache_kb_cat |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 001cf926-92e3-4587-887e-d3156ba24d82 | 8 | 0 | 1.5369792 | 71.12500 | 23.6797916 | 3.375000 | 11 | 1890.571 | 9.216667 | 8.500000 | 16.625 | 9928.483 | 0.6250000 | 2.3250000 | 120 | 34.020000 | 6 | 2094 | 20 | 14 | 31 | 17491.667 | 3 | beta | 2016 | 1204 | False | False | True | en-US | beta | 68 | DuckDuckGo | US | -240 | 7 | 2 | 2527 | 256 | Intel | 4022 | 6.1 | False | 3133.947 | 3713.308 | 3748.471 | 3730.944 | 2444.799 | 1972.6632 | < 5 years | Mozilla | (-6,-4] | < 4GB | < 3GHz | 2 | FALSE | < 256 |
| 00210163-2123-427e-bb73-398bda9f9eba | 2 | 0 | 0.1833333 | 43.50000 | 0.5619445 | 1.500000 | 259 | 22005.000 | 5.000000 | 3.000000 | 6.000 | 5413.500 | 1.0000000 | 0.2152778 | 48 | 0.781111 | 2 | 22005 | 7 | 4 | 7 | 9579.000 | 1 | beta | 2016 | 1124 | False | False | False | en-US | beta | 68 | DuckDuckGo | GB | 60 | 5 | 2 | 2394 | 256 | Intel | 3810 | 6.1 | False | 3226.048 | 2561.596 | 1346.836 | 2523.810 | 1385.350 | 935.6222 | < 5 years | Mozilla | (0,2] | < 4GB | < 3GHz | 2 | FALSE | < 256 |
| 007c0c11-38e4-476b-a494-d732e15ac159 | 2 | 0 | 0.2423611 | 89.00000 | 6.6299995 | 0.000000 | 15 | 7203.000 | 1.000000 | 6.500000 | 9.500 | 7041.667 | 1.0000000 | 0.3222222 | 99 | 9.959722 | 0 | 7203 | 1 | 11 | 12 | 9194.333 | 2 | beta | 2018 | 336 | False | False | False | en-US | beta | 68 | US | 360 | 6 | 2 | 2659 | 3072 | Intel | 3317 | 10.0 | False | 4400.155 | 7244.930 | 3711.445 | 7280.457 | 3929.289 | 1474.5607 | < 2 years | Mozilla | (4,6] | < 4GB | < 3GHz | 2 | FALSE | > 1024 | |
| 009ca4e9-874a-4c3e-983d-af0923346efb | 3 | 0 | 0.1365741 | 29.66667 | 0.2358330 | 0.000000 | 11 | 2112.000 | 1.666667 | 1.666667 | 1.000 | 4644.333 | 1.6666667 | 0.1958333 | 53 | 0.387222 | 0 | 2112 | 2 | 2 | 1 | 6964.500 | 2 | beta | 2013 | 1606 | False | False | True | en-US | beta | 68 | GB | 360 | 6 | 2 | 2594 | 256 | Intel | 3965 | 6.2 | False | 5909.683 | 11043.408 | 5398.452 | 10410.288 | 5860.450 | 4538.1379 | < 5 years | Mozilla | (4,6] | < 4GB | < 3GHz | 2 | FALSE | < 256 | |
| 0101d568-0c63-4492-9295-ed57ef78207f | 3 | 0 | 0.3435185 | 15.66667 | 24.0802773 | 0.000000 | 7 | 17.000 | 1.000000 | 1.666667 | 1.500 | 4214.833 | 0.3333333 | 0.5083333 | 19 | 39.200277 | 0 | 18 | 1 | 2 | 2 | 4215.000 | 1 | beta | 2017 | 6 | False | False | False | en-US | beta | 68 | US | -420 | 5 | 4 | 3093 | 256 | Intel | 16274 | 6.3 | True | 2306.655 | 3225.400 | 2761.783 | 3493.289 | 2141.391 | 2823.6923 | < 1 week | Mozilla | (-8,-6] | < 16GB | < 4GHz | < 4 | FALSE | < 256 | |
| 0159675e-15b0-4443-85b1-94de65455636 | 6 | 0 | 0.0946759 | 18.50000 | 10.6411113 | 1.166667 | 7 | 115.500 | 2.666667 | 3.166667 | 4.000 | 2427.250 | 1.1666667 | 0.1944444 | 33 | 29.938334 | 2 | 137 | 5 | 6 | 7 | 5234.000 | 2 | beta | 2019 | 17 | False | False | False | en-US | beta | 68 | US | -240 | 5 | 4 | 3292 | 256 | Intel | 8098 | 10.0 | True | 5100.212 | 6103.080 | 3665.146 | 6118.852 | 3707.192 | 3391.5286 | < 1 month | Mozilla | (-6,-4] | < 16GB | < 4GHz | < 4 | FALSE | < 256 |
To get acquainted with our training dataset, let’s have a look at its basic information.
| rows | columns | discrete_columns | continuous_columns | all_missing_columns | total_missing_values | complete_rows | total_observations | memory_usage |
|---|---|---|---|---|---|---|---|---|
| 302819 | 58 | 20 | 38 | 0 | 0 | 302819 | 17563502 | 135686992 |
To get acquainted with our validation dataset, let’s have a look at its basic information.
| rows | columns | discrete_columns | continuous_columns | all_missing_columns | total_missing_values | complete_rows | total_observations | memory_usage |
|---|---|---|---|---|---|---|---|---|
| 328042 | 58 | 20 | 38 | 0 | 0 | 328042 | 19026436 | 146987912 |
Let’s use the glimpse function to display a vertical preview of the training dataset, so we can easily check each column’s data type and sample values.
# Column-wise preview of the training frame; the output that follows lists
# each variable with its type and its first values.
glimpse(df_train_f)
## Observations: 302,819
## Variables: 58
## $ client_id <chr> "001cf926-92e3-4587-887e-d31...
## $ num_active_days <int> 8, 5, 2, 8, 4, 6, 8, 4, 3, 5...
## $ content_crashes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ active_hours <dbl> 1.42152778, 0.83055556, 0.51...
## $ uri_count <dbl> 76.12500, 168.20000, 82.0000...
## $ session_length <dbl> 22.93374988, 2.43905560, 0.8...
## $ search_count <dbl> 1.875000, 0.800000, 2.500000...
## $ num_bookmarks <dbl> 11.00, 248.75, 9.00, 87.00, ...
## $ num_pages <dbl> 4014.5000, 20599.5000, 87.00...
## $ daily_unique_domains <dbl> 6.062500, 7.866667, 2.166667...
## $ daily_max_tabs <dbl> 7.000000, 3.400000, 4.000000...
## $ daily_tabs_opened <dbl> 15.000000, 14.800000, 8.0000...
## $ startup_ms <dbl> 54176.2500, 3164.0667, 23977...
## $ daily_num_sessions_started <dbl> 0.6250000, 1.2000000, 5.0000...
## $ active_hours_max <dbl> 3.3208333, 1.6708333, 0.8250...
## $ uri_count_max <int> 139, 325, 145, 210, 255, 323...
## $ session_length_max <dbl> 37.007500, 8.318333, 1.46444...
## $ search_count_max <int> 6, 3, 5, 10, 1, 12, 29, 0, 0...
## $ num_pages_max <dbl> 15464.0, 20719.0, 87.0, 9044...
## $ daily_unique_domains_max <dbl> 17.000000, 17.000000, 3.3333...
## $ daily_max_tabs_max <int> 11, 4, 6, 12, 27, 18, 18, 2,...
## $ daily_tabs_opened_max <int> 29, 33, 15, 37, 36, 35, 170,...
## $ startup_ms_max <dbl> 180095.000, 4966.000, 31918....
## $ daily_num_sessions_started_max <int> 2, 2, 9, 2, 3, 1, 6, 1, 2, 3...
## $ label <fct> beta, beta, beta, beta, beta...
## $ install_year <dbl> 2016, 2016, 2019, 2018, 2018...
## $ profile_age <dbl> 1160, 1079, 745, 130, 293, 5...
## $ fxa_configured <fct> False, False, False, True, F...
## $ sync_configured <fct> False, False, False, True, F...
## $ is_default_browser <fct> True, False, True, True, Fal...
## $ locale <fct> en-US, en-US, en-US, en-US, ...
## $ normalized_channel <fct> beta, beta, beta, beta, beta...
## $ app_version <dbl> 67, 67, 67, 67, 67, 67, 67, ...
## $ default_search_engine <fct> DuckDuckGo, DuckDuckGo, Goog...
## $ country <fct> US, GB, GB, US, US, GB, GB, ...
## $ timezone_offset <int> -240, 60, 60, -240, 360, 60,...
## $ num_addons <dbl> 8.00, 6.00, 6.00, 10.00, 7.0...
## $ cpu_cores <dbl> 2, 2, 4, 4, 2, 4, 2, 2, 2, 2...
## $ cpu_speed_mhz <dbl> 2527, 2394, 2394, 3991, 2659...
## $ cpu_l2_cache_kb <dbl> 256, 256, 256, 256, 3072, 25...
## $ cpu_vendor <fct> Intel, Intel, Intel, Intel, ...
## $ memory_mb <int> 4022, 3810, 8124, 16235, 331...
## $ os_version <ord> 6.1, 6.1, 10.0, 10.0, 10.0, ...
## $ is_wow64 <fct> False, False, False, False, ...
## $ FX_PAGE_LOAD_MS_2_PARENT <dbl> 4223.0885, 2148.3495, 2699.8...
## $ TIME_TO_DOM_COMPLETE_MS <dbl> 5220.036, 2253.526, 2216.994...
## $ TIME_TO_DOM_CONTENT_LOADED_END_MS <dbl> 9079.1364, 1159.9791, 1832.6...
## $ TIME_TO_LOAD_EVENT_END_MS <dbl> 5221.7525, 2146.8265, 2119.2...
## $ TIME_TO_DOM_INTERACTIVE_MS <dbl> 6157.8399, 1155.0503, 1819.6...
## $ TIME_TO_NON_BLANK_PAINT_MS <dbl> 5198.1300, 1015.9784, 1979.7...
## $ profile_age_cat <ord> < 5 years, < 5 years, < 5 ye...
## $ distro_id_norm <fct> Mozilla, Mozilla, Mozilla, M...
## $ timezone_cat <fct> "(-6,-4]", "(0,2]", "(0,2]",...
## $ memory_cat <ord> < 4GB, < 4GB, < 16GB, < 16GB...
## $ cpu_speed_cat <ord> < 3GHz, < 3GHz, < 3GHz, < 4G...
## $ cpu_cores_cat <ord> 2, 2, < 4, < 4, 2, < 4, 2, 2...
## $ is_release <lgl> FALSE, FALSE, FALSE, FALSE, ...
## $ cpu_l2_cache_kb_cat <fct> < 256, < 256, < 256, < 256, ...
If we want to get some metrics about data types, zeros, infinite numbers, and missing values, we can use the df_status function.
# Per-column summary of the training frame (zeros, NAs, infinite values,
# type, number of unique values), rendered as a styled HTML table in a
# full-width scroll box.
# The flag is passed by name (print_results) so the call is self-describing,
# and FALSE is spelled out instead of the reassignable `F` alias.
kable(df_status(df_train_f, print_results = FALSE)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")

| variable | q_zeros | p_zeros | q_na | p_na | q_inf | p_inf | type | unique |
|---|---|---|---|---|---|---|---|---|
| client_id | 0 | 0.00 | 0 | 0 | 0 | 0 | character | 302805 |
| num_active_days | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 8 |
| content_crashes | 302819 | 100.00 | 0 | 0 | 0 | 0 | integer | 1 |
| active_hours | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 180711 |
| uri_count | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 18610 |
| session_length | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 289472 |
| search_count | 80328 | 26.53 | 0 | 0 | 0 | 0 | numeric | 930 |
| num_bookmarks | 472 | 0.16 | 0 | 0 | 0 | 0 | numeric | 19727 |
| num_pages | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 179391 |
| daily_unique_domains | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 66058 |
| daily_max_tabs | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 2888 |
| daily_tabs_opened | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 4645 |
| startup_ms | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 276270 |
| daily_num_sessions_started | 3100 | 1.02 | 0 | 0 | 0 | 0 | numeric | 654 |
| active_hours_max | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 39216 |
| uri_count_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 3642 |
| session_length_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 188078 |
| search_count_max | 80328 | 26.53 | 0 | 0 | 0 | 0 | integer | 140 |
| num_pages_max | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 76011 |
| daily_unique_domains_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1996 |
| daily_max_tabs_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 541 |
| daily_tabs_opened_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 853 |
| startup_ms_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 145192 |
| daily_num_sessions_started_max | 3100 | 1.02 | 0 | 0 | 0 | 0 | integer | 85 |
| label | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| install_year | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 21 |
| profile_age | 6320 | 2.09 | 0 | 0 | 0 | 0 | numeric | 4127 |
| fxa_configured | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| sync_configured | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| is_default_browser | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| locale | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| normalized_channel | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| app_version | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1 |
| default_search_engine | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 6 |
| country | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| timezone_offset | 736 | 0.24 | 0 | 0 | 0 | 0 | integer | 35 |
| num_addons | 53 | 0.02 | 0 | 0 | 0 | 0 | numeric | 2124 |
| cpu_cores | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 27 |
| cpu_speed_mhz | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1232 |
| cpu_l2_cache_kb | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 8 |
| cpu_vendor | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 3 |
| memory_mb | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 5893 |
| os_version | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 5 |
| is_wow64 | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| FX_PAGE_LOAD_MS_2_PARENT | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 294778 |
| TIME_TO_DOM_COMPLETE_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300964 |
| TIME_TO_DOM_CONTENT_LOADED_END_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300503 |
| TIME_TO_LOAD_EVENT_END_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 301045 |
| TIME_TO_DOM_INTERACTIVE_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300036 |
| TIME_TO_NON_BLANK_PAINT_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 296802 |
| profile_age_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| distro_id_norm | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 4 |
| timezone_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 13 |
| memory_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| cpu_speed_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 5 |
| cpu_cores_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| is_release | 59627 | 19.69 | 0 | 0 | 0 | 0 | logical | 2 |
| cpu_l2_cache_kb_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 4 |
- q_zeros: quantity of zeros (p_zeros: in percent)
- q_inf: quantity of infinite values (p_inf: in percent)
- q_na: quantity of NA (p_na: in percent)
- type: factor, ordered-factor, numeric, integer, character or logical
- unique: quantity of unique values

Let’s use the glimpse function to display a vertical preview of the validation dataset, so we can easily check each column’s data type and sample values.
# Column-wise preview of the validation frame; the output that follows lists
# each variable with its type and its first values.
glimpse(df_validate_f)
## Observations: 328,042
## Variables: 58
## $ client_id <chr> "001cf926-92e3-4587-887e-d31...
## $ num_active_days <int> 8, 2, 2, 3, 3, 6, 1, 4, 7, 4...
## $ content_crashes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ active_hours <dbl> 1.53697917, 0.18333333, 0.24...
## $ uri_count <dbl> 71.125000, 43.500000, 89.000...
## $ session_length <dbl> 23.6797916, 0.5619445, 6.629...
## $ search_count <dbl> 3.3750000, 1.5000000, 0.0000...
## $ num_bookmarks <dbl> 11.0, 259.0, 15.0, 11.0, 7.0...
## $ num_pages <dbl> 1890.5714, 22005.0000, 7203....
## $ daily_unique_domains <dbl> 9.216667, 5.000000, 1.000000...
## $ daily_max_tabs <dbl> 8.500000, 3.000000, 6.500000...
## $ daily_tabs_opened <dbl> 16.625, 6.000, 9.500, 1.000,...
## $ startup_ms <dbl> 9928.4833, 5413.5000, 7041.6...
## $ daily_num_sessions_started <dbl> 0.6250000, 1.0000000, 1.0000...
## $ active_hours_max <dbl> 2.32500000, 0.21527778, 0.32...
## $ uri_count_max <int> 120, 48, 99, 53, 19, 33, 32,...
## $ session_length_max <dbl> 34.020000, 0.781111, 9.95972...
## $ search_count_max <int> 6, 2, 0, 0, 0, 2, 0, 5, 1, 1...
## $ num_pages_max <dbl> 2094, 22005, 7203, 2112, 18,...
## $ daily_unique_domains_max <dbl> 20.000000, 7.000000, 1.00000...
## $ daily_max_tabs_max <int> 14, 4, 11, 2, 2, 6, 8, 3, 6,...
## $ daily_tabs_opened_max <int> 31, 7, 12, 1, 2, 7, 13, 71, ...
## $ startup_ms_max <dbl> 17491.667, 9579.000, 9194.33...
## $ daily_num_sessions_started_max <int> 3, 1, 2, 2, 1, 2, 2, 2, 2, 1...
## $ label <fct> beta, beta, beta, beta, beta...
## $ install_year <dbl> 2016, 2016, 2018, 2013, 2017...
## $ profile_age <dbl> 1204, 1124, 336, 1606, 6, 17...
## $ fxa_configured <fct> False, False, False, False, ...
## $ sync_configured <fct> False, False, False, False, ...
## $ is_default_browser <fct> True, False, False, True, Fa...
## $ locale <fct> en-US, en-US, en-US, en-US, ...
## $ normalized_channel <fct> beta, beta, beta, beta, beta...
## $ app_version <dbl> 68, 68, 68, 68, 68, 68, 68, ...
## $ default_search_engine <fct> DuckDuckGo, DuckDuckGo, Goog...
## $ country <fct> US, GB, US, GB, US, US, US, ...
## $ timezone_offset <int> -240, 60, 360, 360, -420, -2...
## $ num_addons <dbl> 7.0, 5.0, 6.0, 6.0, 5.0, 5.0...
## $ cpu_cores <dbl> 2, 2, 2, 2, 4, 4, 1, 2, 4, 3...
## $ cpu_speed_mhz <dbl> 2527, 2394, 2659, 2594, 3093...
## $ cpu_l2_cache_kb <dbl> 256, 256, 3072, 256, 256, 25...
## $ cpu_vendor <fct> Intel, Intel, Intel, Intel, ...
## $ memory_mb <int> 4022, 3810, 3317, 3965, 1627...
## $ os_version <ord> 6.1, 6.1, 10.0, 6.2, 6.3, 10...
## $ is_wow64 <fct> False, False, False, False, ...
## $ FX_PAGE_LOAD_MS_2_PARENT <dbl> 3133.947, 3226.048, 4400.155...
## $ TIME_TO_DOM_COMPLETE_MS <dbl> 3713.308, 2561.596, 7244.930...
## $ TIME_TO_DOM_CONTENT_LOADED_END_MS <dbl> 3748.4715, 1346.8361, 3711.4...
## $ TIME_TO_LOAD_EVENT_END_MS <dbl> 3730.944, 2523.810, 7280.457...
## $ TIME_TO_DOM_INTERACTIVE_MS <dbl> 2444.7985, 1385.3500, 3929.2...
## $ TIME_TO_NON_BLANK_PAINT_MS <dbl> 1972.6632, 935.6222, 1474.56...
## $ profile_age_cat <ord> < 5 years, < 5 years, < 2 ye...
## $ distro_id_norm <fct> Mozilla, Mozilla, Mozilla, M...
## $ timezone_cat <fct> "(-6,-4]", "(0,2]", "(4,6]",...
## $ memory_cat <ord> < 4GB, < 4GB, < 4GB, < 4GB, ...
## $ cpu_speed_cat <ord> < 3GHz, < 3GHz, < 3GHz, < 3G...
## $ cpu_cores_cat <ord> 2, 2, 2, 2, < 4, < 4, 1, 2, ...
## $ is_release <lgl> FALSE, FALSE, FALSE, FALSE, ...
## $ cpu_l2_cache_kb_cat <fct> < 256, < 256, > 1024, < 256,...
If we want to get some metrics about data types, zeros, infinite numbers, and missing values, we can use the df_status function.
# Per-column summary of the validation frame (zeros, NAs, infinite values,
# type, number of unique values), rendered as a styled HTML table in a
# full-width scroll box.
# Fix: this chunk sits in the validation section but was calling df_status()
# on df_train_f again; it now uses df_validate_f.
# NOTE: the table rendered beneath this chunk still comes from the old
# df_train_f call (it matches the training table value for value) and must be
# regenerated by re-knitting the document.
# The flag is passed by name (print_results), and FALSE is spelled out instead
# of the reassignable `F` alias.
kable(df_status(df_validate_f, print_results = FALSE)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")

| variable | q_zeros | p_zeros | q_na | p_na | q_inf | p_inf | type | unique |
|---|---|---|---|---|---|---|---|---|
| client_id | 0 | 0.00 | 0 | 0 | 0 | 0 | character | 302805 |
| num_active_days | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 8 |
| content_crashes | 302819 | 100.00 | 0 | 0 | 0 | 0 | integer | 1 |
| active_hours | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 180711 |
| uri_count | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 18610 |
| session_length | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 289472 |
| search_count | 80328 | 26.53 | 0 | 0 | 0 | 0 | numeric | 930 |
| num_bookmarks | 472 | 0.16 | 0 | 0 | 0 | 0 | numeric | 19727 |
| num_pages | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 179391 |
| daily_unique_domains | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 66058 |
| daily_max_tabs | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 2888 |
| daily_tabs_opened | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 4645 |
| startup_ms | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 276270 |
| daily_num_sessions_started | 3100 | 1.02 | 0 | 0 | 0 | 0 | numeric | 654 |
| active_hours_max | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 39216 |
| uri_count_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 3642 |
| session_length_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 188078 |
| search_count_max | 80328 | 26.53 | 0 | 0 | 0 | 0 | integer | 140 |
| num_pages_max | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 76011 |
| daily_unique_domains_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1996 |
| daily_max_tabs_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 541 |
| daily_tabs_opened_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 853 |
| startup_ms_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 145192 |
| daily_num_sessions_started_max | 3100 | 1.02 | 0 | 0 | 0 | 0 | integer | 85 |
| label | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| install_year | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 21 |
| profile_age | 6320 | 2.09 | 0 | 0 | 0 | 0 | numeric | 4127 |
| fxa_configured | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| sync_configured | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| is_default_browser | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| locale | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| normalized_channel | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| app_version | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1 |
| default_search_engine | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 6 |
| country | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| timezone_offset | 736 | 0.24 | 0 | 0 | 0 | 0 | integer | 35 |
| num_addons | 53 | 0.02 | 0 | 0 | 0 | 0 | numeric | 2124 |
| cpu_cores | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 27 |
| cpu_speed_mhz | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1232 |
| cpu_l2_cache_kb | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 8 |
| cpu_vendor | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 3 |
| memory_mb | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 5893 |
| os_version | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 5 |
| is_wow64 | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| FX_PAGE_LOAD_MS_2_PARENT | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 294778 |
| TIME_TO_DOM_COMPLETE_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300964 |
| TIME_TO_DOM_CONTENT_LOADED_END_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300503 |
| TIME_TO_LOAD_EVENT_END_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 301045 |
| TIME_TO_DOM_INTERACTIVE_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300036 |
| TIME_TO_NON_BLANK_PAINT_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 296802 |
| profile_age_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| distro_id_norm | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 4 |
| timezone_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 13 |
| memory_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| cpu_speed_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 5 |
| cpu_cores_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| is_release | 59627 | 19.69 | 0 | 0 | 0 | 0 | logical | 2 |
| cpu_l2_cache_kb_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 4 |
- q_zeros: quantity of zeros (p_zeros: in percent)
- q_inf: quantity of infinite values (p_inf: in percent)
- q_na: quantity of NA (p_na: in percent)
- type: factor, ordered-factor, numeric, integer, character or logical
- unique: quantity of unique values

Are all the variables in the correct data type?
Yes. It seems that this has already been dealt with in preprocessing.
Any variables with lots of zeros?
Yes. Variables with lots of zeros may not be useful for modeling and, in some cases, they may dramatically bias the model. For example, content_crashes is equal to zero in 100% of the rows.
Any variables with lots of NAs?
None. Good news.
Any high cardinality variable?
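Factor/categorical variables with a high number of different values (~30) tend to cause overfitting when the individual categories contain few observations. None of the factors in the table above comes close: the one with the most levels is timezone_cat, with 13.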
# Split the training frame by label so the two groups can be summarised
# separately. which() is kept so that rows whose label is NA are dropped
# instead of coming back as all-NA rows.
df_release <- df_train_f[which(df_train_f$label == "release"), ]
df_beta <- df_train_f[which(df_train_f$label == "beta"), ]

# Frequency of each label value in the training frame; the result is kept in f.
f <- freq(df_train_f$label)

# Descriptive statistics for the release subset. This call was fused onto the
# freq() line above, which is not valid R (two statements with no separator).
summary(df_release)
## client_id num_active_days content_crashes active_hours
## Length:243192 Min. :1.00 Min. :0 Min. :0.0000
## Class :character 1st Qu.:4.00 1st Qu.:0 1st Qu.:0.2686
## Mode :character Median :6.00 Median :0 Median :0.5744
## Mean :5.57 Mean :0 Mean :0.8469
## 3rd Qu.:8.00 3rd Qu.:0 3rd Qu.:1.1266
## Max. :8.00 Max. :0 Max. :7.1222
##
## uri_count session_length search_count num_bookmarks
## Min. : 1.00 Min. : 0.01926 Min. : 0.000 Min. : 0.00
## 1st Qu.: 44.33 1st Qu.: 2.15904 1st Qu.: 0.000 1st Qu.: 10.00
## Median : 96.67 Median : 6.33512 Median : 0.875 Median : 26.00
## Mean : 156.24 Mean : 9.28218 Mean : 2.377 Mean : 158.94
## 3rd Qu.: 197.00 3rd Qu.:13.66085 3rd Qu.: 3.000 3rd Qu.: 85.21
## Max. :2391.25 Max. :91.06639 Max. :45.750 Max. :18632.00
##
## num_pages daily_unique_domains daily_max_tabs daily_tabs_opened
## Min. : 0 Min. : 1.000 Min. : 0.625 Min. : 1.000
## 1st Qu.: 1022 1st Qu.: 2.283 1st Qu.: 2.500 1st Qu.: 4.000
## Median : 5536 Median : 3.600 Median : 3.714 Median : 8.833
## Mean : 17331 Mean : 4.968 Mean : 6.200 Mean : 17.093
## 3rd Qu.: 19681 3rd Qu.: 6.071 3rd Qu.: 6.000 3rd Qu.: 19.167
## Max. :168416 Max. :39.375 Max. :445.375 Max. :347.500
##
## startup_ms daily_num_sessions_started active_hours_max
## Min. : 261 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 1433 1st Qu.: 1.250 1st Qu.: 0.5403
## Median : 3231 Median : 2.000 Median : 1.1542
## Mean : 9832 Mean : 2.889 Mean : 1.6251
## 3rd Qu.: 8395 3rd Qu.: 3.500 3rd Qu.: 2.1903
## Max. :5358123 Max. :32.250 Max. :23.9667
##
## uri_count_max session_length_max search_count_max num_pages_max
## Min. : 1.0 Min. : 0.0306 Min. : 0.000 Min. : 0
## 1st Qu.: 86.0 1st Qu.: 4.4085 1st Qu.: 0.000 1st Qu.: 1142
## Median : 196.0 Median : 11.7017 Median : 2.000 Median : 5706
## Mean : 321.4 Mean : 18.2107 Mean : 5.434 Mean : 17519
## 3rd Qu.: 400.0 3rd Qu.: 26.1284 3rd Qu.: 7.000 3rd Qu.: 19922
## Max. :18032.0 Max. :384.2883 Max. :217.000 Max. :172543
##
## daily_unique_domains_max daily_max_tabs_max daily_tabs_opened_max
## Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 3.125 1st Qu.: 4.000 1st Qu.: 7.00
## Median : 6.000 Median : 6.000 Median : 17.00
## Mean : 8.552 Mean : 9.318 Mean : 33.27
## 3rd Qu.: 11.000 3rd Qu.: 9.000 3rd Qu.: 38.00
## Max. :100.000 Max. :2425.000 Max. :2410.00
##
## startup_ms_max daily_num_sessions_started_max label
## Min. : 271 Min. : 0.000 beta : 0
## 1st Qu.: 2310 1st Qu.: 2.000 release:243192
## Median : 5695 Median : 4.000
## Mean : 25484 Mean : 5.249
## 3rd Qu.: 16705 3rd Qu.: 6.000
## Max. :39562826 Max. :100.000
##
## install_year profile_age fxa_configured sync_configured
## Min. :2000 Min. : 0.0 False:197466 False:194974
## 1st Qu.:2016 1st Qu.: 257.0 True : 45726 True : 48218
## Median :2018 Median : 698.0
## Mean :2017 Mean : 894.7
## 3rd Qu.:2018 3rd Qu.:1374.0
## Max. :2019 Max. :6922.0
##
## is_default_browser locale normalized_channel app_version
## False:101700 en-GB: 23966 beta : 0 Min. :67
## True :141492 en-US:219226 release:243192 1st Qu.:67
## Median :67
## Mean :67
## 3rd Qu.:67
## Max. :67
##
## default_search_engine country timezone_offset
## Bing : 4442 GB: 36767 Min. :-720.0
## DuckDuckGo : 8328 US:206425 1st Qu.:-300.0
## Google :203781 Median :-240.0
## other (bundled) : 737 Mean :-238.7
## other (non-bundled): 25549 3rd Qu.:-240.0
## Yahoo : 355 Max. : 720.0
##
## num_addons cpu_cores cpu_speed_mhz cpu_l2_cache_kb
## Min. : 0.000 Min. : 1.000 Min. : 792 Min. : 128
## 1st Qu.: 4.000 1st Qu.: 2.000 1st Qu.: 2261 1st Qu.: 256
## Median : 5.000 Median : 2.000 Median : 2712 Median : 256
## Mean : 5.652 Mean : 3.143 Mean : 2711 Mean : 626
## 3rd Qu.: 6.000 3rd Qu.: 4.000 3rd Qu.: 3193 3rd Qu.: 512
## Max. :61.000 Max. :40.000 Max. :15077 Max. :6144
##
## cpu_vendor memory_mb os_version is_wow64
## AMD : 36898 Min. : 512 Other: 15 False:230560
## Intel:206286 1st Qu.: 4011 6.1 : 65588 True : 12632
## Other: 8 Median : 8069 6.2 : 1756
## Mean : 9444 6.3 : 13121
## 3rd Qu.: 12144 10.0 :162712
## Max. :524254
##
## FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS
## Min. : 3.814 Min. : 15
## 1st Qu.: 1900.609 1st Qu.: 1728
## Median : 2657.412 Median : 2498
## Mean : 3030.073 Mean : 3291
## 3rd Qu.: 3797.899 3rd Qu.: 3878
## Max. :10000.000 Max. :50000
##
## TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS
## Min. : 15.62 Min. : 15
## 1st Qu.: 1120.26 1st Qu.: 1614
## Median : 1629.36 Median : 2300
## Mean : 2290.86 Mean : 3017
## 3rd Qu.: 2592.47 3rd Qu.: 3534
## Max. :44262.02 Max. :50000
##
## TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS profile_age_cat
## Min. : 22.89 Min. : 27.0 < 1 week : 7613
## 1st Qu.: 978.79 1st Qu.: 777.2 < 1 month : 6802
## Median : 1374.34 Median : 1095.7 < 6 months:32979
## Mean : 1796.62 Mean : 1445.7 < 2 years :77214
## 3rd Qu.: 2079.06 3rd Qu.: 1651.2 < 5 years :86054
## Max. :34688.55 Max. :100000.0 > 5 years :32530
##
## distro_id_norm timezone_cat memory_cat cpu_speed_cat
## acer : 2619 (-6,-4] :146732 < 1GB : 346 < 1GHz : 452
## Mozilla:237022 (-8,-6] : 54613 < 2GB : 5534 < 2GHz : 38361
## other : 2351 (0,2] : 35955 < 4GB : 66120 < 3GHz :123428
## Yahoo : 1200 (4,6] : 2165 < 6GB : 15982 < 4GHz : 76780
## [-12,-10]: 958 < 16GB:144433 > 16GHz: 4171
## (6,8] : 870 > 16GB: 10777
## (Other) : 1899
## cpu_cores_cat is_release cpu_l2_cache_kb_cat
## 1 : 3122 Mode:logical < 1024: 22370
## 2 :120526 TRUE:243192 < 256 :179679
## < 4 :104846 < 512 : 13262
## < 8 : 13778 > 1024: 27881
## < 16: 745
## > 16: 175
##
## Per-column summary statistics of df_beta (defined earlier in the document).
## The output below reports 59627 rows, all of them with label "beta".
summary(df_beta)
## client_id num_active_days content_crashes active_hours
## Length:59627 Min. :1.000 Min. :0 Min. :0.0000
## Class :character 1st Qu.:4.000 1st Qu.:0 1st Qu.:0.2250
## Mode :character Median :6.000 Median :0 Median :0.5310
## Mean :5.346 Mean :0 Mean :0.8237
## 3rd Qu.:8.000 3rd Qu.:0 3rd Qu.:1.1029
## Max. :8.000 Max. :0 Max. :7.2901
##
## uri_count session_length search_count num_bookmarks
## Min. : 1.00 Min. : 0.01667 Min. : 0.0000 Min. : 0.0
## 1st Qu.: 37.00 1st Qu.: 2.52840 1st Qu.: 0.0000 1st Qu.: 10.0
## Median : 86.67 Median : 7.71056 Median : 0.8333 Median : 26.0
## Mean : 152.75 Mean : 12.29620 Mean : 2.4506 Mean : 242.5
## 3rd Qu.: 188.50 3rd Qu.: 19.57926 3rd Qu.: 2.8333 3rd Qu.: 96.0
## Max. :2931.00 Max. :240.80486 Max. :51.0000 Max. :40401.0
##
## num_pages daily_unique_domains daily_max_tabs
## Min. : 1 Min. : 1.000 Min. : 1.000
## 1st Qu.: 686 1st Qu.: 2.167 1st Qu.: 2.600
## Median : 4186 Median : 3.562 Median : 4.250
## Mean : 17363 Mean : 5.060 Mean : 9.604
## 3rd Qu.: 18605 3rd Qu.: 6.167 3rd Qu.: 8.000
## Max. :179658 Max. :44.000 Max. :1012.625
##
## daily_tabs_opened startup_ms daily_num_sessions_started
## Min. : 1.00 Min. : 269 Min. : 0.000
## 1st Qu.: 4.00 1st Qu.: 2102 1st Qu.: 1.000
## Median : 9.00 Median : 5088 Median : 1.667
## Mean : 20.49 Mean : 25836 Mean : 2.369
## 3rd Qu.: 21.75 3rd Qu.: 12619 3rd Qu.: 2.875
## Max. :518.25 Max. :17109506 Max. :32.833
##
## active_hours_max uri_count_max session_length_max search_count_max
## Min. : 0.000 Min. : 1 Min. : 0.0214 Min. : 0.000
## 1st Qu.: 0.450 1st Qu.: 68 1st Qu.: 4.8778 1st Qu.: 0.000
## Median : 1.064 Median : 172 Median : 14.8089 Median : 2.000
## Mean : 1.578 Mean : 311 Mean : 22.7066 Mean : 5.636
## 3rd Qu.: 2.165 3rd Qu.: 382 3rd Qu.: 31.5190 3rd Qu.: 7.000
## Max. :24.983 Max. :15626 Max. :1255.3822 Max. :188.000
##
## num_pages_max daily_unique_domains_max daily_max_tabs_max
## Min. : 1 Min. : 1.000 Min. : 1.00
## 1st Qu.: 785 1st Qu.: 3.000 1st Qu.: 4.00
## Median : 4340 Median : 5.500 Median : 6.00
## Mean : 17559 Mean : 8.744 Mean : 13.81
## 3rd Qu.: 18886 3rd Qu.: 11.000 3rd Qu.: 12.00
## Max. :180456 Max. :100.000 Max. :3149.00
##
## daily_tabs_opened_max startup_ms_max daily_num_sessions_started_max
## Min. : 1.00 Min. : 269 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.: 3186 1st Qu.: 2.000
## Median : 17.00 Median : 8389 Median : 3.000
## Mean : 39.65 Mean : 86290 Mean : 4.281
## 3rd Qu.: 42.00 3rd Qu.: 23149 3rd Qu.: 5.000
## Max. :3302.00 Max. :106338296 Max. :88.000
##
## label install_year profile_age fxa_configured
## beta :59627 Min. :1993 Min. : 0.0 False:51826
## release: 0 1st Qu.:2017 1st Qu.: 271.0 True : 7801
## Median :2018 Median : 711.0
## Mean :2017 Mean : 893.8
## 3rd Qu.:2018 3rd Qu.:1354.0
## Max. :2019 Max. :7051.0
##
## sync_configured is_default_browser locale normalized_channel
## False:51267 False:26001 en-GB: 2581 beta :59627
## True : 8360 True :33626 en-US:57046 release: 0
##
##
##
##
##
## app_version default_search_engine country timezone_offset
## Min. :67 Bing : 833 GB:17333 Min. :-720.0
## 1st Qu.:67 DuckDuckGo : 2630 US:42294 1st Qu.:-300.0
## Median :67 Google :52211 Median :-240.0
## Mean :67 other (bundled) : 125 Mean :-143.9
## 3rd Qu.:67 other (non-bundled): 3828 3rd Qu.: 60.0
## Max. :67 Yahoo : 0 Max. : 840.0
##
## num_addons cpu_cores cpu_speed_mhz cpu_l2_cache_kb
## Min. : 0.000 Min. : 1.000 Min. : 798 Min. : 128.0
## 1st Qu.: 6.000 1st Qu.: 2.000 1st Qu.: 2200 1st Qu.: 256.0
## Median : 7.000 Median : 2.000 Median : 2594 Median : 256.0
## Mean : 7.855 Mean : 2.976 Mean : 2678 Mean : 679.9
## 3rd Qu.: 8.000 3rd Qu.: 4.000 3rd Qu.: 3192 3rd Qu.: 512.0
## Max. :170.000 Max. :36.000 Max. :37214 Max. :6144.0
##
## cpu_vendor memory_mb os_version is_wow64
## AMD : 8757 Min. : 511 Other: 16 False:44733
## Intel:50820 1st Qu.: 3984 6.1 :17123 True :14894
## Other: 50 Median : 8031 6.2 : 674
## Mean : 8965 6.3 : 3785
## 3rd Qu.: 10238 10.0 :38029
## Max. :262078
##
## FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS
## Min. : 1 Min. : 7
## 1st Qu.: 2028 1st Qu.: 1894
## Median : 2952 Median : 2918
## Mean : 3464 Mean : 4389
## 3rd Qu.: 4487 3rd Qu.: 5106
## Max. :10000 Max. :50000
##
## TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS
## Min. : 18.17 Min. : 7
## 1st Qu.: 1217.61 1st Qu.: 1796
## Median : 1856.04 Median : 2739
## Mean : 2737.64 Mean : 4127
## 3rd Qu.: 3185.18 3rd Qu.: 4758
## Max. :50000.00 Max. :50000
##
## TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS profile_age_cat
## Min. : 40.84 Min. : 7.0 < 1 week : 1590
## 1st Qu.: 1081.05 1st Qu.: 845.6 < 1 month : 2485
## Median : 1618.02 Median : 1251.7 < 6 months: 8173
## Mean : 2404.39 Mean : 1833.7 < 2 years :18155
## 3rd Qu.: 2738.66 3rd Qu.: 2052.5 < 5 years :21807
## Max. :50000.00 Max. :90081.0 > 5 years : 7417
##
## distro_id_norm timezone_cat memory_cat cpu_speed_cat
## acer : 3 (-6,-4]:26166 < 1GB : 378 < 1GHz : 157
## Mozilla:59615 (0,2] :15927 < 2GB : 2924 < 2GHz : 9968
## other : 7 (-8,-6]:10863 < 4GB :18788 < 3GHz :30935
## Yahoo : 2 (4,6] : 3244 < 6GB : 3558 < 4GHz :17258
## (6,8] : 1410 < 16GB:31147 > 16GHz: 1309
## (2,4] : 1119 > 16GB: 2832
## (Other): 898
## cpu_cores_cat is_release cpu_l2_cache_kb_cat
## 1 : 1224 Mode :logical < 1024: 6014
## 2 :32916 FALSE:59627 < 256 :42540
## < 4 :22776 < 512 : 3321
## < 8 : 2507 > 1024: 7752
## < 16: 180
## > 16: 24
##
## Split the validation set by channel label. which() turns the logical mask
## into row indices, so rows whose label is NA are dropped instead of being
## returned as all-NA rows.
df_v_release <- df_validate_f[which(df_validate_f$label == "release"), ]
df_v_beta <- df_validate_f[which(df_validate_f$label == "beta"), ]
## Frequency table of the validation labels
## (freq() is presumably funModeling::freq -- confirm).
f <- freq(df_validate_f$label)
## Per-column summary statistics of the validation release subset
summary(df_v_release)
## client_id num_active_days content_crashes active_hours
## Length:257697 Min. :1.00 Min. :0 Min. :0.0000
## Class :character 1st Qu.:4.00 1st Qu.:0 1st Qu.:0.2641
## Mode :character Median :6.00 Median :0 Median :0.5752
## Mean :5.71 Mean :0 Mean :0.8525
## 3rd Qu.:8.00 3rd Qu.:0 3rd Qu.:1.1399
## Max. :8.00 Max. :0 Max. :7.2205
##
## uri_count session_length search_count num_bookmarks
## Min. : 1.00 Min. : 0.01572 Min. : 0.000 Min. : 0.00
## 1st Qu.: 44.00 1st Qu.: 2.31995 1st Qu.: 0.000 1st Qu.: 10.00
## Median : 97.43 Median : 6.78083 Median : 1.000 Median : 25.33
## Mean : 158.72 Mean : 9.70679 Mean : 2.446 Mean : 158.03
## 3rd Qu.: 200.14 3rd Qu.:14.77410 3rd Qu.: 3.000 3rd Qu.: 84.00
## Max. :2483.17 Max. :90.44222 Max. :45.000 Max. :20002.14
##
## num_pages daily_unique_domains daily_max_tabs
## Min. : 0.0 Min. : 1.000 Min. : 0.5714
## 1st Qu.: 991.3 1st Qu.: 2.287 1st Qu.: 2.5000
## Median : 5308.0 Median : 3.651 Median : 3.8000
## Mean : 17089.9 Mean : 5.112 Mean : 6.3472
## 3rd Qu.: 19359.8 3rd Qu.: 6.250 3rd Qu.: 6.2857
## Max. :168812.1 Max. :42.400 Max. :449.3333
##
## daily_tabs_opened startup_ms daily_num_sessions_started
## Min. : 1.000 Min. : 239 Min. : 0.000
## 1st Qu.: 4.000 1st Qu.: 1567 1st Qu.: 1.167
## Median : 8.857 Median : 3346 Median : 2.000
## Mean : 17.187 Mean : 27279 Mean : 2.831
## 3rd Qu.: 19.500 3rd Qu.: 7770 3rd Qu.: 3.375
## Max. :357.000 Max. :22594812 Max. :32.250
##
## active_hours_max uri_count_max session_length_max search_count_max
## Min. : 0.0000 Min. : 1.0 Min. : 0.0197 Min. : 0.000
## 1st Qu.: 0.5375 1st Qu.: 86.0 1st Qu.: 4.7883 1st Qu.: 0.000
## Median : 1.1653 Median : 199.0 Median : 12.8028 Median : 3.000
## Mean : 1.6370 Mean : 328.4 Mean : 18.6141 Mean : 5.634
## 3rd Qu.: 2.2208 3rd Qu.: 409.0 3rd Qu.: 27.0975 3rd Qu.: 7.000
## Max. :25.4403 Max. :18524.0 Max. :524.5456 Max. :208.000
##
## num_pages_max daily_unique_domains_max daily_max_tabs_max
## Min. : 0 Min. : 1.000 Min. : 1.00
## 1st Qu.: 1113 1st Qu.: 3.200 1st Qu.: 4.00
## Median : 5490 Median : 6.000 Median : 6.00
## Mean : 17289 Mean : 8.837 Mean : 9.54
## 3rd Qu.: 19606 3rd Qu.: 11.000 3rd Qu.: 10.00
## Max. :170532 Max. :100.000 Max. :2215.00
##
## daily_tabs_opened_max startup_ms_max daily_num_sessions_started_max
## Min. : 1.00 Min. : 239 Min. : 0.00
## 1st Qu.: 7.00 1st Qu.: 2618 1st Qu.: 2.00
## Median : 17.00 Median : 5997 Median : 4.00
## Mean : 33.54 Mean : 95160 Mean : 5.18
## 3rd Qu.: 38.00 3rd Qu.: 14981 3rd Qu.: 6.00
## Max. :2342.00 Max. :171712978 Max. :184.00
##
## label install_year profile_age fxa_configured
## beta : 0 Min. :2000 Min. : 0.0 False:207292
## release:257697 1st Qu.:2017 1st Qu.: 235.0 True : 50405
## Median :2018 Median : 673.0
## Mean :2017 Mean : 883.9
## 3rd Qu.:2019 3rd Qu.:1368.0
## Max. :2019 Max. :6972.0
##
## sync_configured is_default_browser locale normalized_channel
## False:205539 False:108253 en-GB: 24771 beta : 0
## True : 52158 True :149444 en-US:232926 release:257697
##
##
##
##
##
## app_version default_search_engine country timezone_offset
## Min. :68 Bing : 5142 GB: 37712 Min. :-720.0
## 1st Qu.:68 DuckDuckGo : 9598 US:219985 1st Qu.:-300.0
## Median :68 Google :225161 Median :-240.0
## Mean :68 missing : 33 Mean :-240.5
## 3rd Qu.:68 other (bundled) : 682 3rd Qu.:-240.0
## Max. :68 other (non-bundled): 16655 Max. : 780.0
## Yahoo : 426
## num_addons cpu_cores cpu_speed_mhz cpu_l2_cache_kb
## Min. : 0.000 Min. : 1.000 Min. : 768 Min. : 128.0
## 1st Qu.: 4.000 1st Qu.: 2.000 1st Qu.: 2261 1st Qu.: 256.0
## Median : 5.000 Median : 3.000 Median : 2712 Median : 256.0
## Mean : 5.678 Mean : 3.192 Mean : 2713 Mean : 610.8
## 3rd Qu.: 6.000 3rd Qu.: 4.000 3rd Qu.: 3193 3rd Qu.: 512.0
## Max. :71.000 Max. :50.000 Max. :28900 Max. :6144.0
##
## cpu_vendor memory_mb os_version is_wow64
## AMD : 38180 Min. : 512 Other: 16 False:244710
## Intel:219506 1st Qu.: 4021 6.1 : 65859 True : 12987
## Other: 11 Median : 8073 6.2 : 1819
## Mean : 9720 6.3 : 13387
## 3rd Qu.: 12180 10.0 :176616
## Max. :1572801
##
## FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS
## Min. : 1 Min. : 28.78
## 1st Qu.: 1815 1st Qu.: 1627.44
## Median : 2520 Median : 2328.42
## Mean : 2894 Mean : 3039.55
## 3rd Qu.: 3590 3rd Qu.: 3547.01
## Max. :10000 Max. :47816.25
##
## TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS
## Min. : 23.71 Min. : 28.95
## 1st Qu.: 1105.90 1st Qu.: 1542.78
## Median : 1592.19 Median : 2182.18
## Mean : 2146.62 Mean : 2837.31
## 3rd Qu.: 2481.38 3rd Qu.: 3301.45
## Max. :48375.15 Max. :47816.25
##
## TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS profile_age_cat
## Min. : 19.86 Min. : 16.71 < 1 week : 8309
## 1st Qu.: 973.66 1st Qu.: 775.88 < 1 month : 9838
## Median : 1362.24 Median : 1093.90 < 6 months:35158
## Mean : 1761.98 Mean : 1448.36 < 2 years :81811
## 3rd Qu.: 2037.65 3rd Qu.: 1652.08 < 5 years :87688
## Max. :41404.35 Max. :100000.00 > 5 years :34893
##
## distro_id_norm timezone_cat memory_cat cpu_speed_cat
## acer : 2790 (-6,-4] :157717 < 1GB : 316 < 1GHz : 448
## Mozilla:251379 (-8,-6] : 57182 < 2GB : 5300 < 2GHz : 40596
## other : 2313 (0,2] : 36782 < 4GB : 66569 < 3GHz :130993
## Yahoo : 1215 (4,6] : 2209 < 6GB : 16419 < 4GHz : 81353
## [-12,-10]: 957 < 16GB:156709 > 16GHz: 4307
## (6,8] : 797 > 16GB: 12384
## (Other) : 2053
## cpu_cores_cat is_release cpu_l2_cache_kb_cat
## 1 : 3101 Mode:logical < 1024: 22578
## 2 :124514 TRUE:257697 < 256 :192910
## < 4 :112698 < 512 : 13930
## < 8 : 16276 > 1024: 28279
## < 16: 900
## > 16: 208
##
## Per-column summary statistics of df_v_beta, the rows of df_validate_f whose
## label is "beta" (70345 rows in the output below).
summary(df_v_beta)
## client_id num_active_days content_crashes active_hours
## Length:70345 Min. :1.000 Min. :0 Min. :0.0000
## Class :character 1st Qu.:3.000 1st Qu.:0 1st Qu.:0.2074
## Mode :character Median :5.000 Median :0 Median :0.5028
## Mean :4.913 Mean :0 Mean :0.7988
## 3rd Qu.:7.000 3rd Qu.:0 3rd Qu.:1.0601
## Max. :8.000 Max. :0 Max. :7.5403
##
## uri_count session_length search_count num_bookmarks
## Min. : 1.00 Min. : 0.020 Min. : 0.000 Min. : 0.0
## 1st Qu.: 33.25 1st Qu.: 2.213 1st Qu.: 0.000 1st Qu.: 9.0
## Median : 80.50 Median : 7.154 Median : 0.750 Median : 23.0
## Mean : 146.33 Mean : 12.337 Mean : 2.324 Mean : 225.4
## 3rd Qu.: 179.00 3rd Qu.: 18.739 3rd Qu.: 2.667 3rd Qu.: 85.0
## Max. :2983.00 Max. :286.698 Max. :50.000 Max. :39519.0
##
## num_pages daily_unique_domains daily_max_tabs
## Min. : 0.0 Min. : 1.000 Min. : 0.400
## 1st Qu.: 543.3 1st Qu.: 2.125 1st Qu.: 2.500
## Median : 3347.5 Median : 3.500 Median : 4.125
## Mean : 15614.0 Mean : 5.148 Mean : 9.020
## 3rd Qu.: 15660.6 3rd Qu.: 6.167 3rd Qu.: 7.750
## Max. :177583.0 Max. :49.292 Max. :910.400
##
## daily_tabs_opened startup_ms daily_num_sessions_started
## Min. : 1.00 Min. : 289 Min. : 0.000
## 1st Qu.: 3.50 1st Qu.: 2219 1st Qu.: 1.000
## Median : 8.50 Median : 5016 Median : 1.667
## Mean : 20.03 Mean : 50073 Mean : 2.398
## 3rd Qu.: 21.00 3rd Qu.: 11210 3rd Qu.: 3.000
## Max. :554.00 Max. :50660200 Max. :32.000
##
## active_hours_max uri_count_max session_length_max search_count_max
## Min. : 0.0000 Min. : 1.0 Min. : 0.0411 Min. : 0.0
## 1st Qu.: 0.3917 1st Qu.: 57.0 1st Qu.: 4.0028 1st Qu.: 0.0
## Median : 0.9625 Median : 152.0 Median : 12.9614 Median : 2.0
## Mean : 1.4712 Mean : 287.3 Mean : 22.2730 Mean : 5.1
## 3rd Qu.: 2.0014 3rd Qu.: 352.0 3rd Qu.: 28.9514 3rd Qu.: 6.0
## Max. :31.1278 Max. :17548.0 Max. :922.2850 Max. :313.0
##
## num_pages_max daily_unique_domains_max daily_max_tabs_max
## Min. : 0 Min. : 1.000 Min. : 1.00
## 1st Qu.: 620 1st Qu.: 3.000 1st Qu.: 4.00
## Median : 3513 Median : 5.200 Median : 6.00
## Mean : 15780 Mean : 8.582 Mean : 12.83
## 3rd Qu.: 15872 3rd Qu.: 10.500 3rd Qu.: 11.00
## Max. :182555 Max. :100.000 Max. :1779.00
##
## daily_tabs_opened_max startup_ms_max daily_num_sessions_started_max
## Min. : 1.0 Min. : 289 Min. : 0.000
## 1st Qu.: 5.0 1st Qu.: 3296 1st Qu.: 1.000
## Median : 15.0 Median : 7801 Median : 3.000
## Mean : 37.3 Mean : 142010 Mean : 4.141
## 3rd Qu.: 39.0 3rd Qu.: 18500 3rd Qu.: 5.000
## Max. :2551.0 Max. :251665490 Max. :110.000
##
## label install_year profile_age fxa_configured
## beta :70345 Min. :2000 Min. : 0.0 False:59564
## release: 0 1st Qu.:2017 1st Qu.: 213.0 True :10781
## Median :2018 Median : 690.0
## Mean :2017 Mean : 875.3
## 3rd Qu.:2019 3rd Qu.:1329.0
## Max. :2019 Max. :7095.0
##
## sync_configured is_default_browser locale normalized_channel
## False:58484 False:32258 en-GB: 2997 beta :70345
## True :11861 True :38087 en-US:67348 release: 0
##
##
##
##
##
## app_version default_search_engine country timezone_offset
## Min. :68 Bing : 1011 GB:20569 Min. :-720.0
## 1st Qu.:68 DuckDuckGo : 2936 US:49776 1st Qu.:-300.0
## Median :68 Google :62032 Median :-240.0
## Mean :68 missing : 35 Mean :-129.9
## 3rd Qu.:68 other (bundled) : 63 3rd Qu.: 60.0
## Max. :68 other (non-bundled): 4266 Max. : 840.0
## Yahoo : 2
## num_addons cpu_cores cpu_speed_mhz cpu_l2_cache_kb
## Min. : 0.000 Min. : 1.000 Min. : 633 Min. : 128.0
## 1st Qu.: 5.000 1st Qu.: 2.000 1st Qu.: 2195 1st Qu.: 256.0
## Median : 6.000 Median : 2.000 Median : 2594 Median : 256.0
## Mean : 6.883 Mean : 2.954 Mean : 2656 Mean : 674.6
## 3rd Qu.: 7.000 3rd Qu.: 4.000 3rd Qu.: 3192 3rd Qu.: 512.0
## Max. :76.000 Max. :32.000 Max. :37221 Max. :6144.0
##
## cpu_vendor memory_mb os_version is_wow64
## AMD :10123 Min. : 511 Other: 12 False:50246
## Intel:60149 1st Qu.: 3981 6.1 :20330 True :20099
## Other: 73 Median : 7973 6.2 : 781
## Mean : 8796 6.3 : 4768
## 3rd Qu.: 8189 10.0 :44454
## Max. :294902
##
## FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS
## Min. : 1 Min. : 34
## 1st Qu.: 2044 1st Qu.: 1902
## Median : 3045 Median : 3024
## Mean : 3593 Mean : 4597
## 3rd Qu.: 4727 3rd Qu.: 5481
## Max. :10000 Max. :50000
##
## TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS
## Min. : 30.71 Min. : 34
## 1st Qu.: 1271.67 1st Qu.: 1816
## Median : 2001.34 Median : 2857
## Mean : 2896.94 Mean : 4376
## 3rd Qu.: 3454.47 3rd Qu.: 5204
## Max. :41450.65 Max. :50000
##
## TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS profile_age_cat
## Min. : 26 Min. : 4.0 < 1 week : 1869
## 1st Qu.: 1143 1st Qu.: 889.2 < 1 month : 3232
## Median : 1764 Median : 1358.9 < 6 months:10897
## Mean : 2627 Mean : 2014.8 < 2 years :20628
## 3rd Qu.: 3069 3rd Qu.: 2286.5 < 5 years :25035
## Max. :49619 Max. :100000.0 > 5 years : 8684
##
## distro_id_norm timezone_cat memory_cat cpu_speed_cat
## acer : 4 (-6,-4]:29163 < 1GB : 396 < 1GHz : 217
## Mozilla:70331 (0,2] :18167 < 2GB : 4078 < 2GHz :12505
## other : 7 (-8,-6]:13002 < 4GB :22890 < 3GHz :36310
## Yahoo : 3 (4,6] : 4667 < 6GB : 4105 < 4GHz :19935
## (6,8] : 2289 < 16GB:35628 > 16GHz: 1378
## (2,4] : 1932 > 16GB: 3248
## (Other): 1125
## cpu_cores_cat is_release cpu_l2_cache_kb_cat
## 1 : 1399 Mode :logical < 1024: 7181
## 2 :39571 FALSE:70345 < 256 :50220
## < 4 :26174 < 512 : 3911
## < 8 : 2970 > 1024: 9033
## < 16: 209
## > 16: 22
##
## Bar charts (plot_bar) and histograms (plot_histogram) for each of the four
## subsets. df_release / df_beta are defined earlier in the document
## (presumably the training split -- confirm); df_v_release / df_v_beta are the
## validation subsets.
## Frequency distribution release dataframe (training)
plot_bar(df_release, ggtheme = theme_minimal(base_size = 15))
## Frequency distribution beta dataframe (training)
plot_bar(df_beta, ggtheme = theme_minimal(base_size = 15))
## Frequency distribution release dataframe (validation)
plot_bar(df_v_release, ggtheme = theme_minimal(base_size = 15))
## Frequency distribution beta dataframe (validation)
plot_bar(df_v_beta, ggtheme = theme_minimal(base_size = 15))
## View histogram of release dataset (training)
plot_histogram(df_release, ggtheme = theme_minimal(base_size = 15))
## View histogram of beta dataset (training)
plot_histogram(df_beta, ggtheme = theme_minimal(base_size = 15))
## View histogram of release dataset (validation)
## The original repeated df_release / df_beta here; the validation subsets are
## used instead, mirroring the four plot_bar() calls above.
plot_histogram(df_v_release, ggtheme = theme_minimal(base_size = 15))
## View histogram of beta dataset (validation)
plot_histogram(df_v_beta, ggtheme = theme_minimal(base_size = 15))
## Training
## uri_count density per label: layers shared by the training and validation
## panels, added in the same order to both plots.
## xlim() sets scale limits, so rows with uri_count outside [0, 1000] are
## removed before the density is estimated (it is not just a zoom).
density_layers <- list(
  geom_density(adjust = 1.5, alpha = 0.6),
  xlim(0, 1000),
  scale_fill_viridis(discrete = TRUE),
  scale_color_viridis(discrete = TRUE),
  labs(x = "URI Count", y = "Density"),
  theme_ipsum()
)
## Training panel
t <- ggplot(
  data = df_train_f,
  mapping = aes(x = uri_count, group = label, fill = label)
) +
  density_layers
## Validation panel
v <- ggplot(
  data = df_validate_f,
  mapping = aes(x = uri_count, group = label, fill = label)
) +
  density_layers
## Show both panels side by side (one row, two columns).
## NOTE(review): plot_grid() looks like cowplot's; cowplot is not in the
## library() list at the top of this document -- confirm it is attached.
plot_grid(t, v, ncol = 2, labels = c("Train", "Validate"))
## Training
## active_hours density per label: layers shared by the training and
## validation panels, added in the same order to both plots (no x-axis limit).
density_layers <- list(
  geom_density(adjust = 1.5, alpha = 0.6),
  scale_fill_viridis(discrete = TRUE),
  scale_color_viridis(discrete = TRUE),
  labs(x = "Active Hours", y = "Density"),
  theme_ipsum()
)
## Training panel
t <- ggplot(
  data = df_train_f,
  mapping = aes(x = active_hours, group = label, fill = label)
) +
  density_layers
## Validation panel
v <- ggplot(
  data = df_validate_f,
  mapping = aes(x = active_hours, group = label, fill = label)
) +
  density_layers
## Show both panels side by side (one row, two columns)
plot_grid(t, v, ncol = 2, labels = c("Train", "Validate"))
## Training
## num_pages density per label: layers shared by the training and validation
## panels, added in the same order to both plots.
## xlim() sets scale limits, so rows with num_pages outside [0, 50000] are
## removed before the density is estimated (it is not just a zoom).
density_layers <- list(
  geom_density(adjust = 1.5, alpha = 0.6),
  xlim(0, 50000),
  scale_fill_viridis(discrete = TRUE),
  scale_color_viridis(discrete = TRUE),
  labs(x = "Num Pages", y = "Density"),
  theme_ipsum()
)
## Training panel
t <- ggplot(
  data = df_train_f,
  mapping = aes(x = num_pages, group = label, fill = label)
) +
  density_layers
## Validation panel
v <- ggplot(
  data = df_validate_f,
  mapping = aes(x = num_pages, group = label, fill = label)
) +
  density_layers
## Show both panels side by side (one row, two columns)
plot_grid(t, v, ncol = 2, labels = c("Train", "Validate"))
## Training
## session_length density per label: layers shared by the training and
## validation panels, added in the same order to both plots.
## xlim() sets scale limits, so rows with session_length outside [0, 75] are
## removed before the density is estimated (it is not just a zoom).
density_layers <- list(
  geom_density(adjust = 1.5, alpha = 0.6),
  xlim(0, 75),
  scale_fill_viridis(discrete = TRUE),
  scale_color_viridis(discrete = TRUE),
  labs(x = "Session Length", y = "Density"),
  theme_ipsum()
)
## Training panel
t <- ggplot(
  data = df_train_f,
  mapping = aes(x = session_length, group = label, fill = label)
) +
  density_layers
## Validation panel
v <- ggplot(
  data = df_validate_f,
  mapping = aes(x = session_length, group = label, fill = label)
) +
  density_layers
plot_grid(t, v, ncol=2, labels = c("Train", "Validate")) ## Show the two panels side by side

This section focuses only on the continuous user engagement metrics. We are going to analyze the following metrics:

- num_active_days
- active_hours
- active_hours_max
- uri_count
- uri_count_max
- session_length
- session_length_max
- search_count
- search_count_max
- num_bookmarks
- num_pages
- num_pages_max
- num_addons
- daily_unique_domains
- daily_unique_domains_max
- daily_max_tabs
- daily_max_tabs_max
- daily_tabs_opened
- daily_tabs_opened_max
- daily_num_sessions_started
- daily_num_sessions_started_max
- startup_ms
- install_year
- profile_age
- timezone_offset
- memory_mb
- cpu_cores
- cpu_speed_mhz
- cpu_l2_cache_kb

kable(text_tbl) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")

| | beta_num_active_days | release_num_active_days | beta_active_hours | release_active_hours | beta_active_hours_max | release_active_hours_max | beta_uri_count | release_uri_count | beta_uri_count_max | release_uri_count_max | beta_session_length | release_session_length | beta_session_length_max | release_session_length_max | beta_search_count | release_search_count | beta_search_count_max | release_search_count_max | beta_num_bookmarks | release_num_bookmarks | beta_num_pages | release_num_pages | beta_num_pages_max | release_num_pages_max | beta_daily_unique_domains | release_daily_unique_domains | beta_daily_max_tabs | release_daily_max_tabs | beta_daily_tabs_opened | release_daily_tabs_opened | beta_daily_num_sessions_started | release_daily_num_sessions_started | beta_daily_unique_domains_max | release_daily_unique_domains_max | beta_daily_max_tabs_max | release_daily_max_tabs_max | beta_daily_tabs_opened_max | release_daily_tabs_opened_max | beta_daily_num_sessions_started_max | release_daily_num_sessions_started_max | beta_startup_ms | release_startup_ms | beta_install_year | release_install_year | beta_profile_age | release_profile_age | beta_timezone_offset | release_timezone_offset | beta_memory_mb | release_memory_mb | beta_cpu_cores | release_cpu_cores | beta_cpu_speed_mhz | release_cpu_speed_mhz | beta_cpu_l2_cache_kb | release_cpu_l2_cache_kb |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. | 1.000000 | 1.000000 | 0.0000000 | 0.0000000 | 0.000000 | 0.0000000 | 1.00000 | 1.00000 | 1.0000 | 1.0000 | 0.0166665 | 0.0192595 | 0.021389 | 0.030556 | 0.0000000 | 0.000000 | 0.000000 | 0.000000 | 0.0000 | 0.0000 | 1.000 | 0.000 | 1.00 | 0.00 | 1.000000 | 1.000000 | 1.000000 | 0.625000 | 1.00000 | 1.000000 | 0.000000 | 0.000000 | 1.00000 | 1.000000 | 1.00000 | 1.000000 | 1.00000 | 1.00000 | 0.000000 | 0.000000 | 269.000 | 261.128 | 1993.000 | 2000.000 | 0.0000 | 0.0000 | -720.000 | -720.000 | 511.000 | 512.000 | 1.000000 | 1.000000 | 798.000 | 792.00 | 128.0000 | 128.0000 |
| 1st Qu. | 4.000000 | 4.000000 | 0.2250000 | 0.2686111 | 0.450000 | 0.5402778 | 37.00000 | 44.33333 | 68.0000 | 86.0000 | 2.5284028 | 2.1590431 | 4.877777 | 4.408542 | 0.0000000 | 0.000000 | 0.000000 | 0.000000 | 10.0000 | 10.0000 | 686.000 | 1022.125 | 785.00 | 1142.00 | 2.166667 | 2.283333 | 2.600000 | 2.500000 | 4.00000 | 4.000000 | 1.000000 | 1.250000 | 3.00000 | 3.125000 | 4.00000 | 4.000000 | 6.00000 | 7.00000 | 2.000000 | 2.000000 | 2102.206 | 1432.677 | 2017.000 | 2016.000 | 271.0000 | 257.0000 | -300.000 | -300.000 | 3984.000 | 4011.000 | 2.000000 | 2.000000 | 2200.000 | 2261.00 | 256.0000 | 256.0000 |
| Median | 6.000000 | 6.000000 | 0.5309524 | 0.5744444 | 1.063889 | 1.1541667 | 86.66667 | 96.66667 | 172.0000 | 196.0000 | 7.7105554 | 6.3351191 | 14.808889 | 11.701666 | 0.8333333 | 0.875000 | 2.000000 | 2.000000 | 26.0000 | 26.0000 | 4185.667 | 5536.000 | 4340.00 | 5705.50 | 3.562500 | 3.600000 | 4.250000 | 3.714286 | 9.00000 | 8.833333 | 1.666667 | 2.000000 | 5.50000 | 6.000000 | 6.00000 | 6.000000 | 17.00000 | 17.00000 | 3.000000 | 4.000000 | 5088.010 | 3231.339 | 2018.000 | 2018.000 | 711.0000 | 698.0000 | -240.000 | -240.000 | 8031.000 | 8069.000 | 2.000000 | 2.000000 | 2594.000 | 2712.00 | 256.0000 | 256.0000 |
| Mean | 5.346169 | 5.569842 | 0.8236611 | 0.8468557 | 1.577508 | 1.6251135 | 152.74550 | 156.24224 | 311.0213 | 321.3891 | 12.2961990 | 9.2821806 | 22.706568 | 18.210749 | 2.4506498 | 2.376504 | 5.636171 | 5.434352 | 242.4878 | 158.9390 | 17363.463 | 17330.600 | 17558.93 | 17518.75 | 5.060464 | 4.968328 | 9.603628 | 6.200080 | 20.49191 | 17.092979 | 2.368895 | 2.888602 | 8.74361 | 8.552061 | 13.81149 | 9.317556 | 39.64786 | 33.27059 | 4.281399 | 5.248573 | 25835.963 | 9832.051 | 2017.138 | 2017.064 | 893.7534 | 894.7365 | -143.855 | -238.714 | 8965.156 | 9443.657 | 2.975699 | 3.143089 | 2678.209 | 2710.62 | 679.9325 | 625.9611 |
| 3rd Qu. | 8.000000 | 8.000000 | 1.1028646 | 1.1265956 | 2.165278 | 2.1902778 | 188.50000 | 197.00000 | 382.0000 | 400.0000 | 19.5792560 | 13.6608531 | 31.519026 | 26.128403 | 2.8333333 | 3.000000 | 7.000000 | 7.000000 | 96.0000 | 85.2125 | 18605.464 | 19680.625 | 18885.50 | 19922.00 | 6.166667 | 6.070833 | 8.000000 | 6.000000 | 21.75000 | 19.166667 | 2.875000 | 3.500000 | 11.00000 | 11.000000 | 12.00000 | 9.000000 | 42.00000 | 38.00000 | 5.000000 | 6.000000 | 12618.764 | 8394.891 | 2018.000 | 2018.000 | 1354.0000 | 1374.0000 | 60.000 | -240.000 | 10238.000 | 12144.000 | 4.000000 | 4.000000 | 3192.000 | 3193.00 | 512.0000 | 512.0000 |
| Max. | 8.000000 | 8.000000 | 7.2901042 | 7.1222222 | 24.983333 | 23.9666667 | 2931.00000 | 2391.25000 | 15626.0000 | 18032.0000 | 240.8048605 | 91.0663890 | 1255.382223 | 384.288333 | 51.0000000 | 45.750000 | 188.000000 | 217.000000 | 40401.0000 | 18632.0000 | 179657.500 | 168416.286 | 180456.00 | 172543.00 | 44.000000 | 39.375000 | 1012.625000 | 445.375000 | 518.25000 | 347.500000 | 32.833333 | 32.250000 | 100.00000 | 100.000000 | 3149.00000 | 2425.000000 | 3302.00000 | 2410.00000 | 88.000000 | 100.000000 | 17109505.514 | 5358122.833 | 2019.000 | 2019.000 | 7051.0000 | 6922.0000 | 840.000 | 720.000 | 262078.000 | 524254.000 | 36.000000 | 40.000000 | 37214.000 | 15077.00 | 6144.0000 | 6144.0000 |
kable(text_tbl_v) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")

| | beta_num_active_days | release_num_active_days | beta_active_hours | release_active_hours | beta_active_hours_max | release_active_hours_max | beta_uri_count | release_uri_count | beta_uri_count_max | release_uri_count_max | beta_session_length | release_session_length | beta_session_length_max | release_session_length_max | beta_search_count | release_search_count | beta_search_count_max | release_search_count_max | beta_num_bookmarks | release_num_bookmarks | beta_num_pages | release_num_pages | beta_num_pages_max | release_num_pages_max | beta_daily_unique_domains | release_daily_unique_domains | beta_daily_max_tabs | release_daily_max_tabs | beta_daily_tabs_opened | release_daily_tabs_opened | beta_daily_num_sessions_started | release_daily_num_sessions_started | beta_daily_unique_domains_max | release_daily_unique_domains_max | beta_daily_max_tabs_max | release_daily_max_tabs_max | beta_daily_tabs_opened_max | release_daily_tabs_opened_max | beta_daily_num_sessions_started_max | release_daily_num_sessions_started_max | beta_startup_ms | release_startup_ms | beta_install_year | release_install_year | beta_profile_age | release_profile_age | beta_timezone_offset | release_timezone_offset | beta_memory_mb | release_memory_mb | beta_cpu_cores | release_cpu_cores | beta_cpu_speed_mhz | release_cpu_speed_mhz | beta_cpu_l2_cache_kb | release_cpu_l2_cache_kb |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. | 1.000000 | 1.000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.000000 | 1.0000 | 1.00000 | 1.0000 | 1.0000 | 0.0199998 | 0.0157222 | 0.041111 | 0.019722 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0000 | 0.00000 | 0.0000 | 0.0000 | 0.00 | 0.00 | 1.000000 | 1.000000 | 0.400000 | 0.5714286 | 1.00000 | 1.000000 | 0.000000 | 0.000000 | 1.00000 | 1.000000 | 1.00000 | 1.000000 | 1.00000 | 1.00000 | 0.000000 | 0.000000 | 289.000 | 2.388333e+02 | 2000.000 | 2000.000 | 0.0000 | 0.000 | -720.0000 | -720.0000 | 511.000 | 512.000 | 1.000000 | 1.000000 | 633.00 | 768.000 | 128.0000 | 128.0000 |
| 1st Qu. | 3.000000 | 4.000000 | 0.2074074 | 0.2640873 | 0.3916667 | 0.537500 | 33.2500 | 44.00000 | 57.0000 | 86.0000 | 2.2133335 | 2.3199533 | 4.002778 | 4.788333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.0000 | 10.00000 | 543.3333 | 991.3333 | 620.00 | 1113.00 | 2.125000 | 2.287037 | 2.500000 | 2.5000000 | 3.50000 | 4.000000 | 1.000000 | 1.166667 | 3.00000 | 3.200000 | 4.00000 | 4.000000 | 5.00000 | 7.00000 | 1.000000 | 2.000000 | 2218.863 | 1.567167e+03 | 2017.000 | 2017.000 | 213.0000 | 235.000 | -300.0000 | -300.0000 | 3981.000 | 4021.000 | 2.000000 | 2.000000 | 2195.00 | 2261.000 | 256.0000 | 256.0000 |
| Median | 5.000000 | 6.000000 | 0.5027778 | 0.5751736 | 0.9625000 | 1.165278 | 80.5000 | 97.42857 | 152.0000 | 199.0000 | 7.1543749 | 6.7808331 | 12.961389 | 12.802778 | 0.750000 | 1.000000 | 2.000000 | 3.000000 | 23.0000 | 25.33333 | 3347.5000 | 5308.0000 | 3513.00 | 5490.00 | 3.500000 | 3.651190 | 4.125000 | 3.8000000 | 8.50000 | 8.857143 | 1.666667 | 2.000000 | 5.20000 | 6.000000 | 6.00000 | 6.000000 | 15.00000 | 17.00000 | 3.000000 | 4.000000 | 5015.522 | 3.345629e+03 | 2018.000 | 2018.000 | 690.0000 | 673.000 | -240.0000 | -240.0000 | 7973.000 | 8073.000 | 2.000000 | 3.000000 | 2594.00 | 2712.000 | 256.0000 | 256.0000 |
| Mean | 4.912574 | 5.710307 | 0.7988445 | 0.8524956 | 1.4711888 | 1.636961 | 146.3339 | 158.71931 | 287.3003 | 328.3892 | 12.3367205 | 9.7067904 | 22.272966 | 18.614122 | 2.324319 | 2.446479 | 5.100206 | 5.633558 | 225.4153 | 158.03362 | 15614.0379 | 17089.9304 | 15779.79 | 17289.08 | 5.148112 | 5.112258 | 9.019717 | 6.3471824 | 20.03166 | 17.187064 | 2.398131 | 2.831202 | 8.58199 | 8.837358 | 12.82845 | 9.539785 | 37.29553 | 33.54423 | 4.141417 | 5.180103 | 50072.589 | 2.727928e+04 | 2017.255 | 2017.194 | 875.2575 | 883.857 | -129.9082 | -240.4712 | 8795.994 | 9719.802 | 2.954155 | 3.191904 | 2656.31 | 2712.603 | 674.5779 | 610.7777 |
| 3rd Qu. | 7.000000 | 8.000000 | 1.0600694 | 1.1399306 | 2.0013889 | 2.220833 | 179.0000 | 200.14286 | 352.0000 | 409.0000 | 18.7390970 | 14.7740975 | 28.951388 | 27.097500 | 2.666667 | 3.000000 | 6.000000 | 7.000000 | 85.0000 | 84.00000 | 15660.6250 | 19359.8333 | 15872.00 | 19606.00 | 6.166667 | 6.250000 | 7.750000 | 6.2857143 | 21.00000 | 19.500000 | 3.000000 | 3.375000 | 10.50000 | 11.000000 | 11.00000 | 10.000000 | 39.00000 | 38.00000 | 5.000000 | 6.000000 | 11210.000 | 7.770437e+03 | 2019.000 | 2019.000 | 1329.0000 | 1368.000 | 60.0000 | -240.0000 | 8189.000 | 12180.000 | 4.000000 | 4.000000 | 3192.00 | 3193.000 | 512.0000 | 512.0000 |
| Max. | 8.000000 | 8.000000 | 7.5402778 | 7.2204861 | 31.1277778 | 25.440278 | 2983.0000 | 2483.16667 | 17548.0000 | 18524.0000 | 286.6983330 | 90.4422220 | 922.285000 | 524.545556 | 50.000000 | 45.000000 | 313.000000 | 208.000000 | 39519.0000 | 20002.14286 | 177583.0000 | 168812.1429 | 182555.00 | 170532.00 | 49.291667 | 42.400000 | 910.400000 | 449.3333333 | 554.00000 | 357.000000 | 32.000000 | 32.250000 | 100.00000 | 100.000000 | 1779.00000 | 2215.000000 | 2551.00000 | 2342.00000 | 110.000000 | 184.000000 | 50660199.500 | 2.259481e+07 | 2019.000 | 2019.000 | 7095.0000 | 6972.000 | 840.0000 | 780.0000 | 294902.000 | 1572801.000 | 32.000000 | 50.000000 | 37221.00 | 28900.000 | 6144.0000 | 6144.0000 |
The QQ plot can be used to compare two continuous distributions.
par(mfrow = c(2, 2)) ## Set up a 2 x 2 plotting space
## QQ plots comparing the Beta and Release samples for every metric listed in
## `user_eng`, each annotated with the two-sample Kolmogorov-Smirnov result.
for (metric in user_eng) {
  beta_vals <- df_beta_ue[[metric]]
  release_vals <- df_release_ue[[metric]]
  ## Same limits on both axes so the red y = x reference line is meaningful.
  axis_rg <- range(beta_vals, release_vals, na.rm = TRUE)
  ## Run the KS test once and reuse both the statistic and the p-value.
  ks_res <- ks.test(beta_vals, release_vals)
  ks_label <- paste("KS Test = ", round(ks_res$statistic, 3))
  ## format.pval() shows very small p-values as "<0.001" rather than a
  ## rounded (and misleading) 0.
  p_label <- paste(
    "P-value = ",
    format.pval(ks_res$p.value, digits = 3, eps = 0.001)
  )
  qqplot(beta_vals, release_vals,
    main = metric, xlim = axis_rg, ylim = axis_rg,
    xlab = "Beta", ylab = "Release", pch = 1
  )
  # mtext(ks_label, side = 3)
  ## Anchor the annotation at the top-left corner of the plotted range; the
  ## range was computed with na.rm = TRUE, so the position is never NA.
  text(axis_rg[1], axis_rg[2], paste(p_label, "\n", ks_label), adj = c(0, 1))
  abline(0, 1, col = "red")
}

## Print the full two-sample KS test report for every metric in `user_eng`.
for (metric in user_eng) {
  ## The names `x` and `y` are kept on purpose: ks.test() echoes the argument
  ## expressions in the "data:" line of its printed report.
  x <- df_beta_ue[[metric]]
  y <- df_release_ue[[metric]]
  print(metric)
  # print(ad.test(x))
  # print(ad.test(y))
  print(ks.test(x, y))
}
## [1] "num_active_days"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.070619, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "active_hours"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.04493, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "active_hours_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.044246, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "uri_count"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.047413, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "uri_count_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.053957, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "session_length"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.098562, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "session_length_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.073339, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "search_count"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.014407, p-value = 4.652e-09
## alternative hypothesis: two-sided
##
## [1] "search_count_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.019136, p-value = 1.221e-15
## alternative hypothesis: two-sided
##
## [1] "num_bookmarks"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.026078, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "num_pages"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.05463, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "num_pages_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.053836, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "num_addons"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.60116, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_unique_domains"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.029154, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_unique_domains_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.03293, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_max_tabs"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.09362, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_max_tabs_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.07981, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_tabs_opened"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.034583, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_tabs_opened_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.032014, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_num_sessions_started"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.11891, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_num_sessions_started_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.11591, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "startup_ms"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.12852, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "install_year"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.018028, p-value = 6.062e-14
## alternative hypothesis: two-sided
##
## [1] "profile_age"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.041815, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "timezone_offset"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.20833, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "memory_mb"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.074939, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "cpu_cores"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.064122, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "cpu_speed_mhz"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.0464, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "cpu_l2_cache_kb"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.025401, p-value < 2.2e-16
## alternative hypothesis: two-sided
The QQ plot can be used to compare two continuous distributions.
par(mfrow = c(2, 2)) ## Set up a 2 x 2 plotting space
## QQ plots comparing the Beta and Release samples (the `_v_` data frames) for
## every metric listed in `user_eng`, each annotated with the two-sample
## Kolmogorov-Smirnov result.
for (metric in user_eng) {
  beta_vals <- df_beta_v_ue[[metric]]
  release_vals <- df_release_v_ue[[metric]]
  ## Same limits on both axes so the red y = x reference line is meaningful.
  axis_rg <- range(beta_vals, release_vals, na.rm = TRUE)
  ## Run the KS test once and reuse both the statistic and the p-value.
  ks_res <- ks.test(beta_vals, release_vals)
  ks_label <- paste("KS Test = ", round(ks_res$statistic, 3))
  ## format.pval() shows very small p-values as "<0.001" rather than a
  ## rounded (and misleading) 0.
  p_label <- paste(
    "P-value = ",
    format.pval(ks_res$p.value, digits = 3, eps = 0.001)
  )
  qqplot(beta_vals, release_vals,
    main = metric, xlim = axis_rg, ylim = axis_rg,
    xlab = "Beta", ylab = "Release", pch = 1
  )
  # mtext(ks_label, side = 3)
  ## Anchor the annotation at the top-left corner of the plotted range; the
  ## range was computed with na.rm = TRUE, so the position is never NA.
  text(axis_rg[1], axis_rg[2], paste(p_label, "\n", ks_label), adj = c(0, 1))
  abline(0, 1, col = "red")
}

## Print the full two-sample KS test report for every metric in `user_eng`.
for (metric in user_eng) {
  ## The names `x` and `y` are kept on purpose: ks.test() echoes the argument
  ## expressions in the "data:" line of its printed report.
  x <- df_beta_v_ue[[metric]]
  y <- df_release_v_ue[[metric]]
  print(metric)
  # print(ad.test(x))
  # print(ad.test(y))
  print(ks.test(x, y))
}
## [1] "num_active_days"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.16585, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "active_hours"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.057657, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "active_hours_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.076585, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "uri_count"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.07132, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "uri_count_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.094754, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "session_length"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.075533, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "session_length_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.039646, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "search_count"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.045374, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "search_count_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.058507, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "num_bookmarks"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.042296, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "num_pages"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.081639, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "num_pages_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.082435, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "num_addons"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.29246, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_unique_domains"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.039839, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_unique_domains_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.054168, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_max_tabs"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.074689, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_max_tabs_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.050656, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_tabs_opened"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.036161, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_tabs_opened_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.052805, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_num_sessions_started"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.097817, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "daily_num_sessions_started_max"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.11694, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "startup_ms"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.1303, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "install_year"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.016919, p-value = 3.642e-14
## alternative hypothesis: two-sided
##
## [1] "profile_age"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.035079, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "timezone_offset"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.23513, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "memory_mb"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.10938, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "cpu_cores"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.087202, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "cpu_speed_mhz"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.061914, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## [1] "cpu_l2_cache_kb"
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.034682, p-value < 2.2e-16
## alternative hypothesis: two-sided
num_active_days, uri_count_max and num_addons

This section focuses only on the discrete user engagement metrics. We are going to analyze the following metrics:
- default_search_engine
- is_default_browser
- profile_age_cat
- distro_id_norm
- memory_cat
- cpu_speed_cat
- cpu_cores_cat
- cpu_l2_cache_kb_cat
- cpu_vendor
- os_version
- is_wow64
- fxa_configured
- sync_configured
- locale
- country
- timezone_cat
- label
- normalized_channel
- is_release

par(mfrow = c(2, 2)) ## Set up a 2 x 2 plotting space
## Relative-frequency bar plots for every discrete metric in `user_eng_dis`:
## one "Beta" panel followed by one "Release" panel per metric.
dis_groups <- list(Beta = df_beta_ue_dis, Release = df_release_ue_dis)
for (metric in user_eng_dis) {
  for (grp in names(dis_groups)) {
    grp_df <- dis_groups[[grp]]
    ## Level counts divided by the total number of rows in the group.
    ## table() skips NA by default while nrow() counts every row, so the
    ## bars sum to less than 1 when the metric has missing values.
    rel_freq <- table(grp_df[[metric]]) / nrow(grp_df)
    bar_mid <- barplot(rel_freq,
      main = grp, # Panel title: which channel is shown
      ylim = c(0, 1), border = FALSE, col = "navy",
      xlab = metric, # Label the x axis with the metric name
      ylab = "Relative Frequency" # Label the y axis
    )
    ## Write the rounded percentage just above each bar.
    text(bar_mid, rel_freq + 0.025, paste0(round(rel_freq * 100), "%"), cex = 1)
  }
}
par(mfrow = c(2, 2)) ## Set up a 2 x 2 plotting space
## Relative-frequency bar plots for every discrete metric in `user_eng_dis`,
## drawn from the `_v_` data frames: one "Beta" panel followed by one
## "Release" panel per metric.
dis_groups_v <- list(Beta = df_beta_v_ue_dis, Release = df_release_v_ue_dis)
for (metric in user_eng_dis) {
  for (grp in names(dis_groups_v)) {
    grp_df <- dis_groups_v[[grp]]
    ## Level counts divided by the total number of rows in the group.
    ## table() skips NA by default while nrow() counts every row, so the
    ## bars sum to less than 1 when the metric has missing values.
    rel_freq <- table(grp_df[[metric]]) / nrow(grp_df)
    bar_mid <- barplot(rel_freq,
      main = grp, # Panel title: which channel is shown
      ylim = c(0, 1), border = FALSE, col = "navy",
      xlab = metric, # Label the x axis with the metric name
      ylab = "Relative Frequency" # Label the y axis
    )
    ## Write the rounded percentage just above each bar.
    text(bar_mid, rel_freq + 0.025, paste0(round(rel_freq * 100), "%"), cex = 1)
  }
}